################################################################################
# SCRIPT: Analysis of survey data
#
# NOTES: Surveys used:
#        (i) Language use in bilingual communities - MD2052, MD2296
#        (ii) Social and political situation in the BC - MD2096, MD2282, MD2407, MD2593
################################################################################

################################################################################
##### Packages and directories #####
################################################################################

## Add your own root directory
#setwd("Replication package/Survey")


## Clean environment
rm(list=ls())

## Packages
library(tidyverse)
library(labelled)
library(ggpubr)

## Input directory
indir <- "Data/"

## Output directory
outdir <- "Output/Figures/"



################################################################################
##### Figures H1, H2, H3, H4, and H5: Descriptives on language and identity #####
################################################################################
## Surveys used: MD2052 (1993), MD2296 (1998) - "Use of language in bilingual communities"

## Import the data
load(paste0(indir, "CIS_lang.RData"))

## Recoding: language of use in parental household
cis <- cis %>% mutate(language_parental = case_when(language_parental == 1 ~ "Euskera",
                                                    language_parental == 2 ~ "Spanish",
                                                    language_parental == 3 ~ "Bilingual",
                                                    language_parental %in% c(0,4,9) ~ NA_character_))


## Birth cohorts
cis$birth_year <-  cis$svy_year - cis$age
cis <- mutate(cis, cohort = case_when(birth_year >= 1904 & birth_year < 1930 ~ "1904-1929",
                                      birth_year >= 1930 & birth_year < 1945 ~ "1930-1944",
                                      birth_year >= 1945 & birth_year < 1960 ~ "1945-1959",
                                      birth_year >= 1960 & birth_year < 1975 ~ "1960-1974",
                                      birth_year >= 1975 ~ "1975-1980"))


##### Figure H4: Language in parental household, by cohort #####
cis %>% 
  filter(!is.na(language_parental) & province_birth%in%c(1, 20, 48)) %>%
  group_by(cohort, language_parental) %>% count() %>%
  ggplot(aes(x=cohort, y=n, fill=language_parental)) +
  geom_col(color="black", stat="identity") +
  labs(x="Birth cohort", y="N") +
  theme_bw() + theme(axis.text.x = element_text(face="bold", size=8, angle=45, hjust=1)) +
  scale_fill_manual(name="Language in \n household", values = c("White", "Grey", "Black"))

## Save
ggsave(filename = "figureH4.png",
       path = outdir,
       width = 12, units = "cm")
  



##### Figure H1: What differentiates more BC from other regions of Spain (1st answer) #####
#  Note the question was asked only in the first wave (1993)
cis %>% filter(basquediff_1st < 98 & !is.na(basquediff_1st)) %>%
  #Recode the "none" category to have closer breaks
  mutate(basquediff_1st = replace(basquediff_1st, basquediff_1st==97, 11)) %>%
  ggplot(aes(x=basquediff_1st)) + geom_bar(color="black", fill="white") +
  scale_x_continuous(breaks=seq(1, 11, 1), labels=c("Geography", "Climate", "Language", "History", "Customs and tradition", "Folklore", "People's personality", "Political attitudes", "Religiosity", "Other", "None"), limits=c(0,12)) +
  coord_flip() + 
  labs(x="", y="N", title="Difference between BC and rest of Spain", subtitle="1st answer") +
  theme_bw()

## Save
ggsave(filename = "figureH1.pdf",
       path = outdir,
       width = 16, units = "cm")

## Look at the 2nd answer (mentioned in App. H)
cis %>% filter(basquediff_2nd < 98 & !is.na(basquediff_2nd)) %>%
  mutate(basquediff_2nd = replace(basquediff_2nd, basquediff_2nd==97, 11)) %>%
  ggplot(aes(x=basquediff_2nd)) + geom_bar(color="black", fill="white") +
  scale_x_continuous(breaks=seq(1, 11, 1), labels=c("Geography", "Climate", "Language", "History", "Customs and tradition", "Folklore", "People's personality", "Political attitudes", "Religiosity", "Other", "None"), limits=c(0,12)) +
  coord_flip() + 
  labs(x="", y="N", title="Difference between BC and rest of Spain", subtitle="2nd answer") +
  theme_bw()


##### Figure H2: What is necessary to consider oneself Basque #####
cis %>% filter(across(starts_with("basqueif_"), ~ .x %in% c(1,2,8))) %>%
  select(starts_with("basqueif_")) %>%
  sapply(.,table) %>%
  as.data.frame() %>%
  rownames_to_column(var = "answer") %>%
  pivot_longer(!answer, names_to = "criterion", values_to = "total") %>%
  mutate(criterion = case_when(criterion == "basqueif_liveandwork" ~ "Live and work in BC",
                               criterion == "basqueif_speakeuskera" ~ "Speak Euskera",
                               criterion == "basqueif_basquefam" ~ "Basque family",
                               criterion == "basqueif_bornbc" ~ "Born in BC",
                               criterion == "basqueif_willing" ~ "Willingness to be Basque",
                               criterion == "basqueif_nationalist" ~ "Nationalist feelings")) %>%
  mutate(answer = case_when(answer == 1 ~ "Yes",
                            answer == 2 ~ "No",
                            answer == 8 ~ "Don't know")) %>%
  ggplot(aes(x=criterion, y=total, fill=answer)) + geom_bar(stat="identity", color="black") +
  coord_flip() +
  labs(x="", y="N") +
  scale_fill_manual(name="", values = c("White", "Grey", "Black")) +
  theme_minimal()

## Save
ggsave(filename = "figureH2.pdf",
       path = outdir,
       width = 14, units = "cm") 



##### Figure H3: Correlation between language proficiency and Basque identity (5 pts scale) #####
# Nivel de conocimiento del euskera
# 1.- Lo entiende, lo habla, lo lee y lo escribe
# 2.- Lo entiende, lo habla y lo lee
# 3.- Lo entiende y lo habla
# 4.- Sólo lo entiende
# 5.- Ni lo habla ni lo entiende
# 9.- N.C.


# Sentimiento nacionalista del País Vasco
# 1.- Me siento únicamente español
# 2.- Me siento más español que vasco
# 3.- Me siento tan español como vasco
# 4.- Me siento más vasco que español
# 5.- Me siento únicamente vasco
# 8.- N.S.
# 9.- N.C.

cis %>%
  select(c(euskera_knowledge, basque_id_5p)) %>%
  filter(euskera_knowledge < 9 & basque_id_5p < 8) %>%
  # rescale so that increases in proficiency
  mutate(euskera_knowledge = 6-euskera_knowledge) %>% 
  group_by(euskera_knowledge) %>% mutate(avg_id = mean(basque_id_5p, na.rm=T)) %>% ungroup() %>%
  ggplot() + geom_smooth(mapping=aes(x=euskera_knowledge, y=basque_id_5p), method="lm") +
  geom_point(mapping = aes(x=euskera_knowledge, y=avg_id)) + 
  scale_x_continuous(breaks=seq(1:5), limits=c(1,5)) +
  scale_y_continuous(breaks=seq(1:5), limits=c(1,5)) +
  labs(x="Knowledge of euskera", y="Relative Basque ID") +
  theme_bw()

## Save
ggsave(filename = "figureH3.pdf",
       path = outdir,
       width = 12, units = "cm")

## Correlation coefficient (mentioned in text)
temp <- cis %>%
  select(c(euskera_knowledge, basque_id_5p)) %>%
  filter(euskera_knowledge < 9 & basque_id_5p < 8) %>%
  # rescale so that increases in proficiency
  mutate(euskera_knowledge = 6-euskera_knowledge)
cor(temp$euskera_knowledge, temp$basque_id_5p, use="pairwise")
rm(temp)


##### Figure H5: Correlation between religiosity and nationalism #####
#  Note we use only the first wave (1993)

# Religiosidad de la persona entrevistada
# 1.- Muy buen católico
# 2.- Católico practicante
# 3.- Católico poco practicante
# 4.- Católico no practicante
# 5.- Creyente de otra religión
# 6.- Indiferente
# 7.- Ateo
# 9.- N.C.

cis %>%
  # Drop the "other religion" category
  filter(basque_id_10p < 98 & !(religiosity %in% c(5,9)) & !is.na(religiosity)) %>%
  mutate(religiosity = remove_val_labels(religiosity)) %>%
  #rescale to have constant break values
  mutate(religiosity = replace(religiosity, religiosity==6, 5)) %>%
  mutate(religiosity = replace(religiosity, religiosity==7, 6)) %>%
  #rescale so that increases in catholicism
  mutate(religiosity = 7-religiosity) %>% 
  ggplot(aes(x=basque_id_10p, y=religiosity)) + geom_smooth(method="lm") +
  scale_x_continuous(breaks=c(1,5,10), limits=c(0,10)) +
  labs(x="Basque nationalism scale", y="Catholicism") +
  facet_wrap(~ cohort, nrow=1) +
  theme_bw()

## Save
ggsave(filename = "figureH5.pdf",
       path = outdir,
       width = 16, units = "cm")

################################################################################
##### Figure H6 and 7: Descriptives on identity and ideology #####
################################################################################
## Surveys used: MD2096 (1994), MD2282 (1998), MD2407 (2001), MD2593 (2005)  - "Social and political situation in the BC" 


## Import the data
load(paste0(indir, "CIS_pol.RData"))


## Birth cohorts
cis_pol$birth_year <- cis_pol$svy_year - cis_pol$age
cis_pol <- mutate(cis_pol, cohort = case_when(birth_year >= 1895 & birth_year < 1930 ~ "1895-1929",
                                              birth_year >= 1930 & birth_year < 1945 ~ "1930-1944",
                                              birth_year >= 1945 & birth_year < 1960 ~ "1945-1959",
                                              birth_year >= 1960 & birth_year < 1975 ~ "1960-1974",
                                              birth_year >= 1975 ~ "1975-1987"))



## Append the data from language use survey and the data from political survey
aggr <- bind_rows(select(cis, c(ideological_10p, basque_id_10p, cohort)),
                  select(cis_pol, c(ideological_10p, basque_id_10p, cohort)))

## Edit the cohort label: the political surveys have larger N, so cohorts are 
## wider. All respondents across surveys are pooled together
aggr$cohort <- ifelse(aggr$cohort == "1975-1980", "1975-1987", aggr$cohort)
aggr$cohort <- ifelse(aggr$cohort == "1904-1929", "1895-1929", aggr$cohort)


## Distribution of Basque nationalism, by cohort
identity <- aggr %>% filter(basque_id_10p < 98  & !is.na(cohort)) %>%
  ggplot(aes(x=basque_id_10p)) + geom_density(color="black", fill="darkgoldenrod1") +
  scale_x_continuous(breaks = c(1,5,10), labels = c("Min. nationalism", "", "Max. nationalism"), limits = c(0,11))+
  labs(x="Basque nationalism scale", y="") +
  facet_grid(rows = vars(cohort)) +
  theme_bw()

## Distribution of Left-Right ideology, by cohort
ideology <- aggr %>% filter(ideological_10p < 98 & !is.na(cohort)) %>%
  ggplot(aes(x=ideological_10p)) + geom_density(color="black", fill="brown2") +
  scale_x_continuous(breaks=c(1,5,10), labels=c("Left", "", "Right"), limits=c(0,11)) +
  labs(x="Ideology scale", y="") +
  facet_grid(rows=vars(cohort)) +
  theme_bw()

##### Figure H6: Plot together #####
ggarrange(ideology, identity,
          ncol=2) %>% annotate_figure(left="Density")

## Save
ggsave(filename = "figureH6.pdf",
       path = outdir,
       width = 16, height=12, units = "cm")

##### Figure 7: Correlation between nationalism and ideology, by cohort #####
aggr %>% filter(basque_id_10p < 98 & ideological_10p < 98 & !is.na(cohort)) %>%
  ggplot(aes(x=basque_id_10p, y=ideological_10p)) + geom_smooth(method="lm") +
  scale_x_continuous(breaks=c(1,5,10), limits=c(0,11)) +
  scale_y_continuous(breaks=seq(1:10)) +
  labs(x="Basque nationalism scale", y="Ideology scale") +
  facet_grid(cols = vars(cohort)) +
  theme_bw()

## Save
ggsave(filename = "figure7.pdf",
       path = outdir,
       width = 16, units = "cm")


################################################################################
##### Congruence between national and regional vote (mentioned in text) #####
################################################################################

## Same choice between vote in regional (autonomic) and national elections
temp <- cis_pol %>%
  # drop those who don't give party indications 
  filter(voted_last_aut<11 & voted_last_nat<11) 
  

# Adjust single years so that parties have the same code each year
temp$voted_last_aut[temp$svy_year==1998 & temp$voted_last_aut>6] <- NA
temp$voted_last_nat[temp$svy_year==1998 & temp$voted_last_nat>6] <- NA 

temp$voted_last_aut[temp$svy_year==2005 & temp$voted_last_aut>5] <- NA
temp$voted_last_nat[temp$svy_year==2005 & temp$voted_last_nat>5] <- NA

temp$voted_last_aut[temp$svy_year==2005 & temp$voted_last_aut==5] <- NA

# In 2005 give EA the same code of PNV because they ran together in
# Euskadi but separate in Spain
temp$voted_last_nat[temp$svy_year==2005 & temp$voted_last_nat==5] <- 1

# Keep people who report both votes
temp <- filter(temp, !is.na(voted_last_aut) & !is.na(voted_last_nat))

# How many people vote the same among those who voted a party and said it?
temp$same <- temp$voted_last_aut == temp$voted_last_nat
prop.table(table(temp$same))*100

rm(temp)
